{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xml.etree.ElementTree as ET\n",
    "from Bio import SeqIO,Align"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load MeSH disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "172968"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build `dag`: normalised MeSH term -> list of tree numbers of its descriptor.\n",
    "tree=ET.parse(\"./desc2024.xml\")\n",
    "dag={}\n",
    "for record in tree.findall(\"DescriptorRecord\"):\n",
    "    # Every distinct term string of every concept, in first-seen order.\n",
    "    terms=[]\n",
    "    seen_terms=set()\n",
    "    for concept in record.findall(\"ConceptList\")[0]:\n",
    "        for term in concept.findall(\"TermList\")[0]:\n",
    "            text=term.findall(\"String\")[0].text\n",
    "            if text not in seen_terms:\n",
    "                seen_terms.add(text)\n",
    "                terms.append(text)\n",
    "    # Sorted, de-duplicated tree numbers of this descriptor.\n",
    "    numbers=[]\n",
    "    for number_list in record.findall(\"TreeNumberList\"):\n",
    "        numbers.extend(node.text for node in number_list.findall(\"TreeNumber\"))\n",
    "    tree_numbers=np.unique(numbers).tolist()\n",
    "    # Among tree numbers that share both their first and their last component, keep\n",
    "    # only the first one in sorted order. This depends only on the descriptor, so it\n",
    "    # is computed once here rather than once per term.\n",
    "    parts=[number.split(\".\") for number in tree_numbers]\n",
    "    redundant=set()\n",
    "    for a in range(len(tree_numbers)):\n",
    "        for b in range(a+1,len(tree_numbers)):\n",
    "            if parts[a][0]==parts[b][0] and parts[a][-1]==parts[b][-1]:\n",
    "                redundant.add(tree_numbers[b])\n",
    "    tree_numbers=[number for number in tree_numbers if number not in redundant]\n",
    "    for term in terms:\n",
    "        # Key: lower-cased term with its ', '-separated parts reversed ('B, A' -> 'a b').\n",
    "        key=\" \".join(term.lower().split(\", \")[::-1])\n",
    "        # Each term gets its own list object; a later record with the same key overwrites.\n",
    "        dag[key]=list(tree_numbers)\n",
    "len(dag)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load NONCODE lncRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "173112"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the lncRNA FASTA once; names and sequences are kept as two parallel lists.\n",
    "lnc_records=[(entry.name,str(entry.seq)) for entry in SeqIO.parse('./outLncRNA.fa','fasta')]\n",
    "lnc_name=[name for name,_ in lnc_records]\n",
    "lnc_seq=[sequence for _,sequence in lnc_records]\n",
    "len(lnc_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load miRBase miRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38589"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the miRNA EMBL file once; names are lower-cased, sequences kept as plain strings.\n",
    "mir_records=[(entry.name.lower(),str(entry.seq)) for entry in SeqIO.parse('./miRNA.dat','embl')]\n",
    "mir_name=[name for name,_ in mir_records]\n",
    "mir_seq=[sequence for _,sequence in mir_records]\n",
    "len(mir_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load HMDD disease & miRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2316, 1891, (53530,))"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the HMDD miRNA-disease table and harmonise the spelling of both name columns.\n",
    "hmdd_data=pd.read_excel('./alldata_v4.xlsx')\n",
    "# copy=True: the names are rewritten in place below, so work on private arrays instead\n",
    "# of views of the DataFrame (which may be read-only under pandas Copy-on-Write).\n",
    "hmdd_disease=hmdd_data['disease'].to_numpy(copy=True)\n",
    "hmdd_miRNA=hmdd_data['miRNA'].to_numpy(copy=True)\n",
    "#miRNA\n",
    "##align load\n",
    "axb=['EBV-mir-BART11','EBV-mir-BART12','EBV-mir-BART2','EBV-mir-BART4','EBV-mir-BART8','EBV-mir-BHRF1-2','EBV-mir-BHRF1-3','ebv-miR-BHRF1-2','ebv-miR-BHRF1-3','hcmv-miR-UL148d','hcmv-miR-UL22A','hcmv-miR-UL22a','hcmv-miR-US33','hcmv-mir-UL148D','hcmv-mir-UL70','kshv-miR-k12-1','kshv-miR-k12-11','kshv-miR-k12-4','kshv-mir-K12-1','kshv-mir-k12-1']\n",
    "bxb=['EBV-miR-BART11','EBV-miR-BART12','EBV-miR-BART2','EBV-miR-BART4','EBV-miR-BART8','EBV-miR-BHRF1-2','EBV-miR-BHRF1-3','EBV-miR-BHRF1-2','EBV-miR-BHRF1-3','hcmv-miR-UL148D','Hcmv-miR-UL22A','Hcmv-miR-UL22A','HCMV-miR-US33','hcmv-miR-UL148D','hcmv-miR-UL70','kshv-miR-K12-1','kshv-miR-K12-11','kshv-miR-K12-4','kshv-miR-K12-1','kshv-miR-K12-1']\n",
    "# Replacements are applied one after another, in list order.\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    hmdd_miRNA[hmdd_miRNA==old_name]=new_name\n",
    "##align merge\n",
    "axb=['hsa-mir-1','hsa-mir-107','hsa-mir-1297','hsa-mir-133b','hsa-mir-137','hsa-mir-190b','hsa-mir-203a','hsa-mir-206','hsa-mir-217','hsa-mir-23c','hsa-mir-3118','hsa-mir-320c','hsa-mir-320d','hsa-mir-326','hsa-mir-346','hsa-mir-3666','hsa-mir-378b','hsa-mir-378c','hsa-mir-378e','hsa-mir-378f','hsa-mir-378h','hsa-mir-384','hsa-mir-421','hsa-mir-422a','hsa-mir-4262','hsa-mir-429','hsa-mir-4295','hsa-mir-4306','hsa-mir-4319','hsa-mir-4429','hsa-mir-4458','hsa-mir-4465','hsa-mir-448','hsa-mir-449a','hsa-mir-4500','hsa-mir-451a','hsa-mir-4644','hsa-mir-4770','hsa-mir-496','hsa-mir-520b','hsa-mir-520e','hsa-mir-543','hsa-mir-544a','hsa-mir-599','hsa-mir-613','hsa-mir-761']\n",
    "bxb=['hsa-miR-1','hsa-miR-107','hsa-miR-1297','hsa-miR-133b','hsa-miR-137','hsa-miR-190b','hsa-miR-203a','hsa-miR-206','hsa-miR-217','hsa-miR-23c','hsa-miR-3118','hsa-miR-320c','hsa-miR-320d','hsa-miR-326','hsa-miR-346','hsa-miR-3666','hsa-miR-378b','hsa-miR-378c','hsa-miR-378e','hsa-miR-378f','hsa-miR-378h','hsa-miR-384','hsa-miR-421','hsa-miR-422a','hsa-miR-4262','hsa-miR-429','hsa-miR-4295','hsa-miR-4306','hsa-miR-4319','hsa-miR-4429','hsa-miR-4458','hsa-miR-4465','hsa-miR-448','hsa-miR-449a','hsa-miR-4500','hsa-miR-451a','hsa-miR-4644','hsa-miR-4770','hsa-miR-496','hsa-miR-520b','hsa-miR-520e','hsa-miR-543','hsa-miR-544a','hsa-miR-599','hsa-miR-613','hsa-miR-761']\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    hmdd_miRNA[hmdd_miRNA==old_name]=new_name\n",
    "hmdd_uqmiRNA=np.unique(hmdd_miRNA)\n",
    "hmdd_miRNA1=[name.lower() for name in hmdd_uqmiRNA]\n",
    "#disease\n",
    "# Replace non-breaking spaces by ordinary spaces.\n",
    "for idx,name in enumerate(hmdd_disease):\n",
    "    if '\\xa0' in name:\n",
    "        hmdd_disease[idx]=name.replace('\\xa0',' ')\n",
    "## align load\n",
    "hmdd_disease[hmdd_disease=='sporatic amyotrophic lateral sclerosis']='Amyotrophic Lateral Sclerosis'\n",
    "axb=['Acute Aortic Dissection','Acute Coronary Syndrome ','Acute Kidney Injury','Alzheimer Disease ','Asthma ','Brain Ischemia ','Chronic Kidney   Disease','Chronic Pelvic Pain Syndrome','Chronic Rhinosinusitis With Nasal  Polyps','Chronic Rhinosinusitis With Nasal Polyps','Chronic Rhinosinusitis With Nasal Polyps ','Coronary Artery Disease','Diabetic Retinopathy','Diabetic Retinopathy ','Dilated Cardiomyopathy ','Endothelial Dysfunction ','Gluteal Muscle Contracture','Heart Failure, Acute','Hemorrhagic Fever with Renal Syndrome','Hepatocellular Carcinoma, Hbv-Related','Inflammation, Chronic','Intracranial Aneurysm','Lumbar Disc Herniation','Metabolic Syndrome','Neointimal Hyperplasia','Osteoarthritis ','Pulmonary Embolism, Acute','Pulmonary Fibrosis, Idiopathic ','Renal Interstitial Fibrosis','Severe Asthma','Subarachnoid Hemorrhage ','T-cell acute lymphoblastic leukemia','Ulcerative   Colitis','acute myocardial infarction','colorectal adenocarcinoma','lacrimal gland adenoid cystic carcinoma','peripartum cardiomyopathy','renal fibrosis']\n",
    "bxb=['Acute Aortic   Dissection','Acute Coronary Syndrome','Acute  Kidney Injury','Alzheimer Disease','Asthma','Brain Ischemia','Chronic  Kidney Disease','Chronic   Pelvic Pain Syndrome','Chronic  Rhinosinusitis With Nasal Polyps','Chronic  Rhinosinusitis With Nasal Polyps','Chronic  Rhinosinusitis With Nasal Polyps','Coronary Artery  Disease','Diabetic  Retinopathy','Diabetic  Retinopathy','Cardiomyopathy, Dilated','Endothelial Dysfunction','Gluteal Muscle   Contracture','Acute Heart Failure','Hemorrhagic Fever With Renal Syndrome','HBV-related Hepatocellular Carcinoma','Chronic Inflammation','Intracranial  Aneurysm','Lumbar Disc   Herniation','Metabolic  Syndrome','Neointimal   Hyperplasia','Osteoarthritis','Acute Pulmonary Embolism','Idiopathic Pulmonary Fibrosis','Interstitial Fibrosis, Renal','Asthma, Severe','Subarachnoid Hemorrhage','Acute Lymphoblastic Leukemia, T-Cell','Colitis, Ulcerative','Myocardial Infarction, Acute','Colorectal   Adenocarcinoma','Carcinoma, Adenoid Cystic, Lacrimal Gland','Peripartum Cardiomyopathy ','Fibrosis, Renal']\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    hmdd_disease[hmdd_disease==old_name]=new_name\n",
    "## align merge\n",
    "axb=['Autism spectrum disorder','DiGeorge syndrome','Muscular Dystrophy, Duchenne','Myositis, Inclusion Body','Prader-Willi syndrome','abdominal aortic aneurysm','acute myeloid leukemia','adenocarcinoma','aortic aneurysm','atherosclerosis','basal cell carcinoma','bipolar disorder','carcinoma','cleft lip','coronary artery disease','coronary disease','depression','embryonal carcinoma','endometriosis','fragile X syndrome','frailty','gastric cancer','glioblastoma','glioma','infertility','ischemic stroke','leukemia','liver injury','lung adenocarcinoma','lymphoma','melanoma','multiple sclerosis','myocardial infarction','myopia','nasopharyngeal carcinoma','neural tube defects','neuroblastoma','obesity','osteosarcoma','panic disorder','periodontitis','pheochromocytoma','plexiform neurofibroma','pre-eclampsia','psoriasis','rhabdomyosarcoma','rheumatoid arthritis','schizophrenia','squamous cell carcinoma']\n",
    "bxb=['Autism Spectrum Disorder','DiGeorge Syndrome','Duchenne muscular dystrophy','Inclusion body myositis','Prader-Willi Syndrome','Aortic Aneurysm, Abdominal','Leukemia, Myeloid, Acute','Adenocarcinoma','Aortic Aneurysm','Atherosclerosis','Carcinoma, Basal Cell','Bipolar Disorder','Carcinoma','Cleft Lip','Coronary Artery  Disease','Coronary Disease','Depression','Carcinoma, Embryonal','Endometriosis','Fragile X Syndrome','Frailty','Gastric   Cancer','Glioblastoma','Glioma','Infertility','Ischemic Stroke','Leukemia','Liver Injury','Lung  Adenocarcinoma','Lymphoma','Melanoma','Multiple Sclerosis','Myocardial Infarction','Myopia','Nasopharyngeal Carcinoma','Neural Tube Defects','Neuroblastoma','Obesity','Osteosarcoma','Panic Disorder','Periodontitis','Pheochromocytoma','Neurofibroma, Plexiform','Pre-Eclampsia','Psoriasis','Rhabdomyosarcoma','Arthritis, Rheumatoid','Schizophrenia','Carcinoma, Squamous Cell']\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    hmdd_disease[hmdd_disease==old_name]=new_name\n",
    "hmdd_uqdisease=np.unique(hmdd_disease)\n",
    "# Normalised form: lower-case, ', '-separated parts reversed, repeated spaces collapsed.\n",
    "hmdd_disease1=[]\n",
    "for name in hmdd_uqdisease:\n",
    "    reordered=\" \".join(name.lower().split(\", \")[::-1])\n",
    "    hmdd_disease1.append(\" \".join(word for word in reordered.split(\" \") if word!=''))\n",
    "len(hmdd_disease1),len(hmdd_miRNA1),hmdd_disease.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load lncRNASNP lncRNA & disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(167, (107,), (753,))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the lncRNASNP lncRNA-disease table and harmonise the disease names.\n",
    "snp_ld_data=pd.read_csv('./lncrna-diseases_experiment.txt',delimiter='\\t')\n",
    "# copy=True: disease names are rewritten in place below, so work on a private array\n",
    "# instead of a view of the DataFrame (which may be read-only under pandas Copy-on-Write).\n",
    "snp_disease=snp_ld_data['Disease'].to_numpy(copy=True)\n",
    "snp_ld_lncRNA=snp_ld_data['lncRNA'].to_numpy()\n",
    "#lncRNA\n",
    "snp_ld_uqlncRNA=np.unique(snp_ld_lncRNA)\n",
    "#disease\n",
    "snp_disease[snp_disease=='Plexiform neurofibroma']='plexiform neurofibroma'\n",
    "## align merge\n",
    "axb=['Autism spectrum disorder','DiGeorge syndrome','Muscular Dystrophy, Duchenne','Myositis, Inclusion Body','Prader-Willi syndrome','abdominal aortic aneurysm','acute myeloid leukemia','adenocarcinoma','aortic aneurysm','atherosclerosis','basal cell carcinoma','bipolar disorder','carcinoma','cleft lip','coronary artery disease','coronary disease','depression','embryonal carcinoma','endometriosis','fragile X syndrome','frailty','gastric cancer','glioblastoma','glioma','infertility','ischemic stroke','leukemia','liver injury','lung adenocarcinoma','lymphoma','melanoma','multiple sclerosis','myocardial infarction','myopia','nasopharyngeal carcinoma','neural tube defects','neuroblastoma','obesity','osteosarcoma','panic disorder','periodontitis','pheochromocytoma','plexiform neurofibroma','pre-eclampsia','psoriasis','rhabdomyosarcoma','rheumatoid arthritis','schizophrenia','squamous cell carcinoma']\n",
    "bxb=['Autism Spectrum Disorder','DiGeorge Syndrome','Duchenne muscular dystrophy','Inclusion body myositis','Prader-Willi Syndrome','Aortic Aneurysm, Abdominal','Leukemia, Myeloid, Acute','Adenocarcinoma','Aortic Aneurysm','Atherosclerosis','Carcinoma, Basal Cell','Bipolar Disorder','Carcinoma','Cleft Lip','Coronary Artery  Disease','Coronary Disease','Depression','Carcinoma, Embryonal','Endometriosis','Fragile X Syndrome','Frailty','Gastric   Cancer','Glioblastoma','Glioma','Infertility','Ischemic Stroke','Leukemia','Liver Injury','Lung  Adenocarcinoma','Lymphoma','Melanoma','Multiple Sclerosis','Myocardial Infarction','Myopia','Nasopharyngeal Carcinoma','Neural Tube Defects','Neuroblastoma','Obesity','Osteosarcoma','Panic Disorder','Periodontitis','Pheochromocytoma','Neurofibroma, Plexiform','Pre-Eclampsia','Psoriasis','Rhabdomyosarcoma','Arthritis, Rheumatoid','Schizophrenia','Carcinoma, Squamous Cell']\n",
    "# Replacements are applied one after another, in list order.\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    snp_disease[snp_disease==old_name]=new_name\n",
    "snp_uqdisease=np.unique(snp_disease)\n",
    "snp_disease1=[name.lower() for name in snp_uqdisease]\n",
    "len(snp_disease1),snp_ld_uqlncRNA.shape,snp_ld_lncRNA.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load lncRNASNP lncRNA & miRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((541,), (268,), (10597,))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the lncRNASNP lncRNA-miRNA table (no header row) and harmonise the miRNA names.\n",
    "snp_lm_data=pd.read_csv('./mirnas_lncrnas_validated.txt',delimiter='\\t',names=['lncRNA','miRNA'])\n",
    "snp_lm_lncRNA=snp_lm_data['lncRNA'].to_numpy()\n",
    "# copy=True: miRNA names are rewritten in place below, so work on a private array\n",
    "# instead of a view of the DataFrame (which may be read-only under pandas Copy-on-Write).\n",
    "snp_miRNA=snp_lm_data['miRNA'].to_numpy(copy=True)\n",
    "#lncRNA\n",
    "snp_lm_uqlncRNA=np.unique(snp_lm_lncRNA)\n",
    "#miRNA\n",
    "##align merge\n",
    "axb=['hsa-mir-1','hsa-mir-107','hsa-mir-1297','hsa-mir-133b','hsa-mir-137','hsa-mir-190b','hsa-mir-203a','hsa-mir-206','hsa-mir-217','hsa-mir-23c','hsa-mir-3118','hsa-mir-320c','hsa-mir-320d','hsa-mir-326','hsa-mir-346','hsa-mir-3666','hsa-mir-378b','hsa-mir-378c','hsa-mir-378e','hsa-mir-378f','hsa-mir-378h','hsa-mir-384','hsa-mir-421','hsa-mir-422a','hsa-mir-4262','hsa-mir-429','hsa-mir-4295','hsa-mir-4306','hsa-mir-4319','hsa-mir-4429','hsa-mir-4458','hsa-mir-4465','hsa-mir-448','hsa-mir-449a','hsa-mir-4500','hsa-mir-451a','hsa-mir-4644','hsa-mir-4770','hsa-mir-496','hsa-mir-520b','hsa-mir-520e','hsa-mir-543','hsa-mir-544a','hsa-mir-599','hsa-mir-613','hsa-mir-761']\n",
    "bxb=['hsa-miR-1','hsa-miR-107','hsa-miR-1297','hsa-miR-133b','hsa-miR-137','hsa-miR-190b','hsa-miR-203a','hsa-miR-206','hsa-miR-217','hsa-miR-23c','hsa-miR-3118','hsa-miR-320c','hsa-miR-320d','hsa-miR-326','hsa-miR-346','hsa-miR-3666','hsa-miR-378b','hsa-miR-378c','hsa-miR-378e','hsa-miR-378f','hsa-miR-378h','hsa-miR-384','hsa-miR-421','hsa-miR-422a','hsa-miR-4262','hsa-miR-429','hsa-miR-4295','hsa-miR-4306','hsa-miR-4319','hsa-miR-4429','hsa-miR-4458','hsa-miR-4465','hsa-miR-448','hsa-miR-449a','hsa-miR-4500','hsa-miR-451a','hsa-miR-4644','hsa-miR-4770','hsa-miR-496','hsa-miR-520b','hsa-miR-520e','hsa-miR-543','hsa-miR-544a','hsa-miR-599','hsa-miR-613','hsa-miR-761']\n",
    "# Replacements are applied one after another, in list order.\n",
    "for old_name,new_name in zip(axb,bxb):\n",
    "    snp_miRNA[snp_miRNA==old_name]=new_name\n",
    "snp_uqmiRNA=np.unique(snp_miRNA)\n",
    "snp_miRNA1=[name.lower() for name in snp_uqmiRNA]\n",
    "snp_lm_uqlncRNA.shape,snp_uqmiRNA.shape,snp_miRNA.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((2113,), (634,), (2430,))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Union of the entity names of both sources. `return_index` gives the position of the\n",
    "# first occurrence of each unique raw name, which is used to pick the matching\n",
    "# normalised name from the concatenated normalised lists.\n",
    "#miRNA\n",
    "miRNA_pool=np.concatenate([hmdd_uqmiRNA,snp_uqmiRNA],axis=0)\n",
    "merge_miRNA,midx=np.unique(miRNA_pool,return_index=True)\n",
    "miRNA_pool1=np.array(hmdd_miRNA1+snp_miRNA1)\n",
    "merge_miRNA1=miRNA_pool1[midx]\n",
    "#lncRNA\n",
    "lncRNA_pool=np.concatenate([snp_ld_uqlncRNA,snp_lm_uqlncRNA],axis=0)\n",
    "merge_lncRNA=np.unique(lncRNA_pool)\n",
    "#disease\n",
    "disease_pool=np.concatenate([hmdd_uqdisease,snp_uqdisease],axis=0)\n",
    "merge_disease,didx=np.unique(disease_pool,return_index=True)\n",
    "disease_pool1=np.array(hmdd_disease1+snp_disease1)\n",
    "merge_disease1=disease_pool1[didx]\n",
    "merge_miRNA1.shape,merge_lncRNA.shape,merge_disease1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# match"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## match disease"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2077"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map raw disease names to MeSH tree numbers in three passes. After every pass the\n",
    "# matched names are removed from `merge_disease1` / `merge_disease`, so this cell\n",
    "# consumes those arrays and must be run after a fresh run of the merge cell.\n",
    "def _trimmed_name_matches(names_norm,names_raw,drop_index):\n",
    "    \"\"\"Match names against `dag` after dropping one word (0 = first, -1 = last).\n",
    "\n",
    "    Every hit is stored in `match_disease` under the raw name, with '.-<k>'\n",
    "    appended to the k-th tree number of the shortened term (k starts at 1).\n",
    "    Returns the set of normalised names that were matched.\n",
    "    \"\"\"\n",
    "    hits=set()\n",
    "    for raw,norm in zip(names_raw,names_norm):\n",
    "        words=norm.split(' ')\n",
    "        words.pop(drop_index)\n",
    "        shortened=' '.join(words)\n",
    "        if shortened in dag:\n",
    "            match_disease[raw]=[number+'.-'+str(rank) for rank,number in enumerate(dag[shortened],start=1)]\n",
    "            hits.add(norm)\n",
    "    return hits\n",
    "\n",
    "def _without(names_norm,names_raw,hits):\n",
    "    \"\"\"Return both parallel arrays without the entries whose normalised name is in `hits`.\"\"\"\n",
    "    keep=np.array([name not in hits for name in names_norm],dtype=bool)\n",
    "    return names_norm[keep],names_raw[keep]\n",
    "\n",
    "#first match: the normalised name itself is a MeSH term\n",
    "match_disease={}\n",
    "match_t=set()\n",
    "for raw,norm in zip(merge_disease,merge_disease1):\n",
    "    if norm in dag:\n",
    "        match_disease[raw]=dag[norm]\n",
    "        match_t.add(norm)\n",
    "merge_disease1,merge_disease=_without(merge_disease1,merge_disease,match_t)\n",
    "#second match: the name without its first word is a MeSH term\n",
    "match_t=_trimmed_name_matches(merge_disease1,merge_disease,0)\n",
    "merge_disease1,merge_disease=_without(merge_disease1,merge_disease,match_t)\n",
    "#third match: the name without its last word is a MeSH term\n",
    "match_t=_trimmed_name_matches(merge_disease1,merge_disease,-1)\n",
    "merge_disease1,merge_disease=_without(merge_disease1,merge_disease,match_t)\n",
    "len(match_disease)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## match miRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1245"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach a sequence to every merged miRNA whose lower-cased name is a loaded miRNA name.\n",
    "# One dictionary lookup per name replaces the former `in list` + `list.index` scans.\n",
    "mir_seq_by_name={}\n",
    "for name,sequence in zip(mir_name,mir_seq):\n",
    "    # setdefault keeps the first record of a repeated name, as list.index did\n",
    "    mir_seq_by_name.setdefault(name,sequence)\n",
    "match_miRNA={}\n",
    "match_t=[]\n",
    "for raw,norm in zip(merge_miRNA,merge_miRNA1):\n",
    "    if norm in mir_seq_by_name:\n",
    "        match_miRNA[raw]=mir_seq_by_name[norm]\n",
    "        match_t.append(norm)\n",
    "# Keep only the names that found no sequence (this consumes the merged arrays).\n",
    "unmatched=np.array([norm not in mir_seq_by_name for norm in merge_miRNA1],dtype=bool)\n",
    "merge_miRNA1=merge_miRNA1[unmatched]\n",
    "merge_miRNA=merge_miRNA[unmatched]\n",
    "len(match_miRNA)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## match lncRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "557"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach a sequence to every merged lncRNA whose name is a loaded lncRNA name.\n",
    "# One dictionary lookup per name replaces the former `in list` + `list.index` scans.\n",
    "lnc_seq_by_name={}\n",
    "for name,sequence in zip(lnc_name,lnc_seq):\n",
    "    # setdefault keeps the first record of a repeated name, as list.index did\n",
    "    lnc_seq_by_name.setdefault(name,sequence)\n",
    "match_lncRNA={name:lnc_seq_by_name[name] for name in merge_lncRNA if name in lnc_seq_by_name}\n",
    "len(match_lncRNA)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# screen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1245, 2077), 23337.0, (1245, 557), 1438.0, (2077, 557), 320.0)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the matched entity names, then build and save one 0/1 association matrix per\n",
    "# source table, keeping only the pairs whose two entities were both matched.\n",
    "miRNA_name=np.array(list(match_miRNA))\n",
    "np.save('../miRNA_name.npy',miRNA_name)\n",
    "disease_name=np.array(list(match_disease))\n",
    "np.save('../disease_name.npy',disease_name)\n",
    "lncRNA_name=np.array(list(match_lncRNA))\n",
    "np.save('../lncRNA_name.npy',lncRNA_name)\n",
    "#hmdd\n",
    "hmdd_pairs=[(dis,mir) for dis,mir in zip(hmdd_disease,hmdd_miRNA) if dis in match_disease and mir in match_miRNA]\n",
    "hmdd_disease2=[dis for dis,_ in hmdd_pairs]\n",
    "hmdd_miRNA2=[mir for _,mir in hmdd_pairs]\n",
    "mir_dis_m=np.zeros((len(miRNA_name),len(disease_name)))\n",
    "for dis,mir in hmdd_pairs:\n",
    "    mir_dis_m[np.where(miRNA_name==mir),np.where(disease_name==dis)]=1\n",
    "np.save(\"../miRNA_disease.npy\",mir_dis_m)\n",
    "#lncrnasnp_lm\n",
    "snp_lm_pairs=[(lnc,mir) for lnc,mir in zip(snp_lm_lncRNA,snp_miRNA) if lnc in match_lncRNA and mir in match_miRNA]\n",
    "snp_lm_lncRNA2=[lnc for lnc,_ in snp_lm_pairs]\n",
    "snp_miRNA2=[mir for _,mir in snp_lm_pairs]\n",
    "mir_lnc_m=np.zeros((len(miRNA_name),len(lncRNA_name)))\n",
    "for lnc,mir in snp_lm_pairs:\n",
    "    mir_lnc_m[np.where(miRNA_name==mir),np.where(lncRNA_name==lnc)]=1\n",
    "np.save(\"../miRNA_lncRNA.npy\",mir_lnc_m)\n",
    "#lncrnasnp_ld\n",
    "snp_ld_pairs=[(lnc,dis) for lnc,dis in zip(snp_ld_lncRNA,snp_disease) if lnc in match_lncRNA and dis in match_disease]\n",
    "snp_ld_lncRNA2=[lnc for lnc,_ in snp_ld_pairs]\n",
    "snp_disease2=[dis for _,dis in snp_ld_pairs]\n",
    "dis_lnc_m=np.zeros((len(disease_name),len(lncRNA_name)))\n",
    "for lnc,dis in snp_ld_pairs:\n",
    "    dis_lnc_m[np.where(disease_name==dis),np.where(lncRNA_name==lnc)]=1\n",
    "np.save(\"../disease_lncRNA.npy\",dis_lnc_m)\n",
    "mir_dis_m.shape,mir_dis_m.sum(),mir_lnc_m.shape,mir_lnc_m.sum(),dis_lnc_m.shape,dis_lnc_m.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# calculate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## calculate disease sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2077, 2077)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pairwise disease similarity computed from the tree numbers in `match_disease`;\n",
    "# the square matrix is saved to ../disease_disease.npy.\n",
    "# NOTE(review): looks like a DAG-based semantic similarity with decay `delta` -\n",
    "# confirm against the method description.\n",
    "delta=0.5\n",
    "# DAs[name] = 1 + sum over the disease's tree numbers of (1 - delta**(depth-1)),\n",
    "# where depth is the number of '.'-separated components of the tree number.\n",
    "DAs={}\n",
    "for i in match_disease:\n",
    "    DA=1\n",
    "    for j in match_disease[i]:\n",
    "        k=list(j.split('.'))\n",
    "        DA=DA+ 1-delta**(len(k)-1)\n",
    "    DAs[i]=DA\n",
    "# DVs[a][b] = similarity of disease_name[a] and disease_name[b].\n",
    "DVs=[]\n",
    "for i in disease_name:\n",
    "    DV=[]\n",
    "    for j in disease_name:\n",
    "        # Same name or identical tree-number lists: similarity is 1.\n",
    "        if i==j or match_disease[i]==match_disease[j]:\n",
    "            DV.append(1)\n",
    "        else:\n",
    "            DVup=0\n",
    "            # Shared prefixes already recorded for this disease pair.\n",
    "            unidag=[]\n",
    "            for ii in match_disease[i]:\n",
    "                ik=list(ii.split('.'))\n",
    "                for jj in match_disease[j]:\n",
    "                    jk=list(jj.split('.'))\n",
    "                    if ik[0]==jk[0]:\n",
    "                        # ij: length of the common prefix of the two tree numbers.\n",
    "                        ij=0\n",
    "                        while ij<len(ik) and ij <len(jk) and ik[ij]==jk[ij]:\n",
    "                            ij+=1\n",
    "                        # sij: longest prefix ik shares with any recorded prefix.\n",
    "                        sij=0\n",
    "                        for iu in unidag:\n",
    "                            iuj=0\n",
    "                            while iuj<len(iu) and iuj <len(ik) and ik[iuj]==iu[iuj]:\n",
    "                                iuj+=1\n",
    "                            if sij<iuj:\n",
    "                                sij=iuj\n",
    "                        if sij==0:\n",
    "                            unidag.append(ik[:ij])\n",
    "                        else:\n",
    "                            sij-=1\n",
    "                        # From here on ij is the index of the last shared component.\n",
    "                        ij-=1\n",
    "                        # Add a contribution only when the shared prefix extends past sij.\n",
    "                        if sij<ij:\n",
    "                            if ij==len(ik)-1:\n",
    "                                # ik is entirely contained in the shared prefix.\n",
    "                                DVup=DVup+(2-delta**(ij-sij))+(1-delta**(ij-sij+1))*delta**(len(jk)-2-ij)\n",
    "                            elif ij==len(jk)-1:\n",
    "                                # jk is entirely contained in the shared prefix.\n",
    "                                DVup=DVup+(1-delta**(ij-sij+1))*delta**(len(ik)-2-ij)+(2-delta**(ij-sij))\n",
    "                            else:\n",
    "                                # Both tree numbers continue below the shared prefix.\n",
    "                                DVup=DVup+(1-delta**(ij-sij+1))*delta**(len(ik)-2-ij)+(1-delta**(ij-sij+1))*delta**(len(jk)-2-ij)\n",
    "                    else:\n",
    "                        # Stops scanning j's remaining tree numbers for this ii.\n",
    "                        # NOTE(review): a later tree number of j could still share ik's\n",
    "                        # first component and is skipped here - confirm `break` (rather\n",
    "                        # than `continue`) is intended.\n",
    "                        break\n",
    "            # Normalise the accumulated shared value by the two per-disease values.\n",
    "            DV.append(DVup/(DAs[i]+DAs[j]))\n",
    "    DVs.append(DV)\n",
    "DVs=np.array(DVs)\n",
    "np.save('../disease_disease.npy',DVs)\n",
    "DVs.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## calculate miRNA seq sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1245, 1245)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pairwise miRNA sequence similarity: score of the first global alignment returned by\n",
    "# the aligner, divided by that alignment's length (second entry of its shape).\n",
    "aligner=Align.PairwiseAligner()\n",
    "aligner.mode='global'\n",
    "match_miRNA_lit=[match_miRNA[name] for name in miRNA_name]\n",
    "ASs=[]\n",
    "for first_seq in match_miRNA_lit:\n",
    "    AS=[]\n",
    "    for second_seq in match_miRNA_lit:\n",
    "        best=aligner.align(first_seq,second_seq)[0]\n",
    "        AS.append(best.score/best.shape[1])\n",
    "    ASs.append(AS)\n",
    "ASs=np.array(ASs)\n",
    "np.save('../miRNA_miRNA.npy',ASs)\n",
    "ASs.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## calculate lncRNA seq sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(557, 557)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pairwise lncRNA sequence similarity, computed like the miRNA one. Reuses the global\n",
    "# `aligner` created in the miRNA similarity cell, so that cell must be run first.\n",
    "match_lncRNA_lit=[match_lncRNA[name] for name in lncRNA_name]\n",
    "ASs=[]\n",
    "for first_seq in match_lncRNA_lit:\n",
    "    AS=[]\n",
    "    for second_seq in match_lncRNA_lit:\n",
    "        best=aligner.align(first_seq,second_seq)[0]\n",
    "        AS.append(best.score/best.shape[1])\n",
    "    ASs.append(AS)\n",
    "ASs=np.array(ASs)\n",
    "np.save('../lncRNA_lncRNA.npy',ASs)\n",
    "ASs.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
